In [1]:
import pandas as pd
import numpy as np

# Read Excel File
from itertools import islice
import csv

# Visulisation
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
from tabulate import tabulate
import plotly.express as px
from matplotlib import cm
from matplotlib import patches
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource
from bokeh.palettes import Category20
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.io import output_notebook
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from scipy.spatial import ConvexHull
import dexplot as dxp

# Math Helper
import re
import random
import math
//anaconda3/lib/python3.7/site-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [2]:
file = csv.reader('downloads/USA_CancerRates_All_ByCounty.csv')

value_by_row = []

with open('downloads/USA_CancerRates_All_ByCounty.csv', encoding='windows-1252') as f:
    r = csv.reader(f)
    for row in islice(r,11, None):
        value_by_row.append(row)
value_by_row

cols_name = value_by_row[0]

values_to_fill = value_by_row[1:len(value_by_row)]

Cancer_rate = pd.DataFrame(values_to_fill,columns=cols_name)

Cancer_rate.head()
Out[2]:
State FIPS Age-AdjustedCANCERcases_per100000 Lower 95% Confidence Interval Upper 95% Confidence Interval Average Annual Count Recent Trend Recent 5-Year Tren in Incidence Rates Lower 95%CI Upper 95% CI
0 US (SEER+NPCR)(1,10) 0 448 447.7 448.4 1638110 falling -0.9 -1.6 -0.1
1 Union County, Florida(6,10) 12125 1135 1063.6 1210.1 208 falling -4.6 -5.1 -4.1
2 Logan County, Nebraska(6,10) 31113 677.2 455 974.3 7 stable 1.2 -41 73.4
3 Powell County, Kentucky(7,8) 21197 646.2 586.6 710.3 92 stable 1.1 -0.4 2.6
4 Taylor County, Kentucky(7,8) 21217 623.3 583.7 665 196 rising 1.4 0.7 2.2
In [3]:
Cancer_rate.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2932 entries, 0 to 2931
Data columns (total 10 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   State                                  2932 non-null   object
 1    FIPS                                  2932 non-null   object
 2   Age-AdjustedCANCERcases_per100000      2932 non-null   object
 3   Lower 95% Confidence Interval          2932 non-null   object
 4   Upper 95% Confidence Interval          2932 non-null   object
 5   Average Annual Count                   2932 non-null   object
 6   Recent Trend                           2932 non-null   object
 7   Recent 5-Year Tren in Incidence Rates  2932 non-null   object
 8   Lower 95%CI                            2932 non-null   object
 9   Upper 95% CI                           2932 non-null   object
dtypes: object(10)
memory usage: 229.2+ KB
In [4]:
# Count the Null Value in each column

fig,(ax1,ax2) = plt.subplots(1,2,figsize=(20,10))

na_counts = Cancer_rate.isna().sum()
color_palette = sns.color_palette()[5]
null_values = sns.barplot(na_counts.index.values, na_counts, color = color_palette,ax=ax1)\
                 .set_title('Null Values Counts')

fig.autofmt_xdate()

msno.matrix(Cancer_rate, figsize=(16, 16), width_ratios=(15, 1),ax=ax2)

# so we can find there is no NA in each column
//anaconda3/lib/python3.7/site-packages/missingno/missingno.py:61: UserWarning:

Plotting a sparkline on an existing axis is not currently supported. To remove this warning, set sparkline=False.

Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a19446e50>
In [5]:
# We can see that 
# there's a lot of different geographic information in first column, 
# therefore, we can try to seperate this column.

State = []
Counties = []
SEER = []
NPCR = []

# I will seperate the first row, because it looks differently from other rows
State.append(Cancer_rate.loc[0,'State'].split('(')[0])
Counties.append('Null')
SEER.append(re.split(r'[(),]',Cancer_rate.loc[0,'State'])[3])
NPCR.append(re.split(r'[(),]',Cancer_rate.loc[0,'State'])[4])

# Then I decide to check whether there are special palces
special_place = []
for i in range(0,len(Cancer_rate)):
    regex = re.findall('(.*)?\s?(.*),\s(.*)',Cancer_rate['State'][i])
    if not regex:
        special_place.append(Cancer_rate['State'].loc[i])
print('These places have different pattern from other places, and they are %s' %special_place)

#  they are District of Columbia, and Puerto Rico

Cancer_rate[Cancer_rate['State'].str.contains('District of Columbia')|\
            Cancer_rate['State'].str.contains('Puerto Rico')]
# we find their indexes are 1947 and 2710

# Then I will seperate other rows firstly

for i in [z for z in range(1,len(Cancer_rate)) if z not in [0,1947,2710]]:
    State.append(re.split(r'[(),]',Cancer_rate.loc[i,'State'])[1])
    Counties.append(re.split(r'[(),]',Cancer_rate.loc[i,'State'])[0])
    SEER.append(re.split(r'[(),]',Cancer_rate.loc[i,'State'])[2])
    NPCR.append(re.split(r'[(),]',Cancer_rate.loc[i,'State'])[3])
    
# Insert two different places into list

for j in ([1947,2710]):
    State.insert(j,Cancer_rate.loc[j,'State'].split('(')[0])
    Counties.insert(j,'Null')
    SEER.insert(j,re.split(r'[(),]',Cancer_rate.loc[j,'State'])[1])
    NPCR.insert(j,re.split(r'[(),]',Cancer_rate.loc[j,'State'])[2])

Cancer_rate['State'] = State
Cancer_rate['Counties'] = Counties
Cancer_rate['SEER'] = SEER
Cancer_rate['NPCR'] = NPCR

Cancer_rate['State'] = Cancer_rate['State'].apply(lambda x: x.strip())

Cancer_rate.head()
These places have different pattern from other places, and they are ['US (SEER+NPCR)(1,10)', 'District of Columbia(6,10)', 'Puerto Rico(6,10)']
Out[5]:
State FIPS Age-AdjustedCANCERcases_per100000 Lower 95% Confidence Interval Upper 95% Confidence Interval Average Annual Count Recent Trend Recent 5-Year Tren in Incidence Rates Lower 95%CI Upper 95% CI Counties SEER NPCR
0 US 0 448 447.7 448.4 1638110 falling -0.9 -1.6 -0.1 Null 1 10
1 Florida 12125 1135 1063.6 1210.1 208 falling -4.6 -5.1 -4.1 Union County 6 10
2 Nebraska 31113 677.2 455 974.3 7 stable 1.2 -41 73.4 Logan County 6 10
3 Kentucky 21197 646.2 586.6 710.3 92 stable 1.1 -0.4 2.6 Powell County 7 8
4 Kentucky 21217 623.3 583.7 665 196 rising 1.4 0.7 2.2 Taylor County 7 8
In [6]:
# Then I still need to double check
# 1. all categorical variable should not contain numerical number
# 2. all numerical column should only contain number

# 1.
cat_count = {}
cat_list = ['State','Recent Trend','Counties']
for i in cat_list:
    cat_count['{0}_count'.format(i)] = 0
    for value in Cancer_rate[i]:
        if not value.isnumeric():
            cat_count['{0}_count'.format(i)] += 1
print('The number of character in each columns: \n')
print(tabulate(pd.DataFrame(cat_count.items(), columns=['variable_name', 'count'])))

# so all categorical variables do not have any number

# 2.
num_list = [i for i in Cancer_rate.columns if i not in cat_list]
num_df = pd.DataFrame([],columns=['all_number','index','count'],index=num_list)
for value in num_list:
    num_df.loc[value,'all_number']= pd.to_numeric(Cancer_rate[value].str.lstrip('-'), errors='coerce').notnull().all()
    if num_df.loc[value,'all_number'] == True:
        num_df.loc[value,'index'] = 0
    if num_df.loc[value,'all_number'] == False:
        index = []
        for i in range(0,2932):
            if Cancer_rate.loc[i,value].lstrip('-').replace('.','',1).isdigit() == False:
                index.append(i)
            num_df.loc[value,'index'] = index
            num_df.loc[value,'count'] = len(index)
num_df = num_df.fillna({'count':0})
print("\nNumber of not numerical values in each columns: \n")
display(num_df)          
The number of character in each columns: 

-  ------------------  ----
0  State_count         2932
1  Recent Trend_count  2932
2  Counties_count      2932
-  ------------------  ----

Number of not numerical values in each columns: 

all_number index count
FIPS True 0 0
Age-AdjustedCANCERcases_per100000 False [68, 134, 269, 324, 391, 654, 666, 870, 873, 9... 80
Lower 95% Confidence Interval True 0 0
Upper 95% Confidence Interval True 0 0
Average Annual Count True 0 0
Recent 5-Year Tren in Incidence Rates False [272, 339, 410, 543, 584, 1193, 1407, 1623, 16... 12
Lower 95%CI False [272, 339, 410, 543, 584, 1193, 1407, 1623, 16... 12
Upper 95% CI False [272, 339, 410, 543, 584, 1193, 1407, 1623, 16... 12
SEER True 0 0
NPCR False [272, 339, 410, 543, 584, 1193, 1407, 1623, 16... 11
In [7]:
# then we randomly select several rows to double check
print(Cancer_rate.loc[1407])
print('------------')
print(Cancer_rate.loc[324])
print('------------')
print(Cancer_rate.loc[543])

# we can find except Age-Adjusted column,
# all other four columns are missed in the same row.
# there should be some relationship between Recent 5-year Trend and NPCR
State                                          Georgia
 FIPS                                            13301
Age-AdjustedCANCERcases_per100000                457.1
Lower 95% Confidence Interval                    388.9
Upper 95% Confidence Interval                      535
Average Annual Count                                36
Recent Trend                                         *
Recent 5-Year Tren in Incidence Rates                *
Lower 95%CI                                          *
Upper 95% CI                                         *
Counties                                 Warren County
SEER                                                 7
NPCR                                                  
Name: 1407, dtype: object
------------
State                                          Michigan
 FIPS                                             26165
Age-AdjustedCANCERcases_per100000             510.1 #  
Lower 95% Confidence Interval                     479.3
Upper 95% Confidence Interval                     542.4
Average Annual Count                                219
Recent Trend                                     stable
Recent 5-Year Tren in Incidence Rates              -1.3
Lower 95%CI                                        -4.4
Upper 95% CI                                          2
Counties                                 Wexford County
SEER                                                  6
NPCR                                                 10
Name: 324, dtype: object
------------
State                                          Georgia
 FIPS                                            13211
Age-AdjustedCANCERcases_per100000                494.9
Lower 95% Confidence Interval                    454.7
Upper 95% Confidence Interval                      538
Average Annual Count                               119
Recent Trend                                         *
Recent 5-Year Tren in Incidence Rates                *
Lower 95%CI                                          *
Upper 95% CI                                         *
Counties                                 Morgan County
SEER                                                 7
NPCR                                                  
Name: 543, dtype: object
In [8]:
# Then I find some columns' names are too long
# I decide to shorten them before data cleaning

column_names = Cancer_rate.columns

Cancer_rate.rename(columns={column_names[2]:'Cases_Per100000',column_names[3]:'Cases_Per100000_LCI',column_names[4]:'Cases_Per100000_UCI',\
                            column_names[7]:'Trend_in_5_year',column_names[8]:'Trend_LCI',column_names[9]:'Trend_UCI'},inplace=True)

Cancer_rate.head()
Out[8]:
State FIPS Cases_Per100000 Cases_Per100000_LCI Cases_Per100000_UCI Average Annual Count Recent Trend Trend_in_5_year Trend_LCI Trend_UCI Counties SEER NPCR
0 US 0 448 447.7 448.4 1638110 falling -0.9 -1.6 -0.1 Null 1 10
1 Florida 12125 1135 1063.6 1210.1 208 falling -4.6 -5.1 -4.1 Union County 6 10
2 Nebraska 31113 677.2 455 974.3 7 stable 1.2 -41 73.4 Logan County 6 10
3 Kentucky 21197 646.2 586.6 710.3 92 stable 1.1 -0.4 2.6 Powell County 7 8
4 Kentucky 21217 623.3 583.7 665 196 rising 1.4 0.7 2.2 Taylor County 7 8
In [10]:
# Then I will clean Cases_Per100000 firstly (original name is Age-AdjustedCANCERcases_per100000)
pattern = '[\d+.\d\s\b\s]'
age_count = 0
for i in num_df.iloc[1,1]:
    if bool(re.match(pattern, Cancer_rate.loc[i,'Cases_Per100000'])) == True:
        age_count += 1
print(age_count == num_df.iloc[1,2])

# Therefore, the different values are following same pattern
# we can change them at one time

for j in num_df.iloc[1,1]:
    Cancer_rate.loc[j,'Cases_Per100000'] = Cancer_rate.loc[j,'Cases_Per100000'].split(' ')[0]
True
In [11]:
for k in num_df.iloc[5,1]:
    print(Cancer_rate.loc[k])
State                         Georgia
 FIPS                           13159
Cases_Per100000                 515.7
Cases_Per100000_LCI             466.8
Cases_Per100000_UCI             568.7
Average Annual Count               88
Recent Trend                        *
Trend_in_5_year                     *
Trend_LCI                           *
Trend_UCI                           *
Counties                Jasper County
SEER                                7
NPCR                                 
Name: 272, dtype: object
State                             Georgia
 FIPS                               13303
Cases_Per100000                     508.8
Cases_Per100000_LCI                 469.7
Cases_Per100000_UCI                 550.5
Average Annual Count                  131
Recent Trend                            *
Trend_in_5_year                         *
Trend_LCI                               *
Trend_UCI                               *
Counties                Washington County
SEER                                    7
NPCR                                     
Name: 339, dtype: object
State                         Georgia
 FIPS                           13237
Cases_Per100000                 503.6
Cases_Per100000_LCI             467.3
Cases_Per100000_UCI             542.2
Average Annual Count              161
Recent Trend                        *
Trend_in_5_year                     *
Trend_LCI                           *
Trend_UCI                           *
Counties                Putnam County
SEER                                7
NPCR                                 
Name: 410, dtype: object
State                         Georgia
 FIPS                           13211
Cases_Per100000                 494.9
Cases_Per100000_LCI             454.7
Cases_Per100000_UCI               538
Average Annual Count              119
Recent Trend                        *
Trend_in_5_year                     *
Trend_LCI                           *
Trend_UCI                           *
Counties                Morgan County
SEER                                7
NPCR                                 
Name: 543, dtype: object
State                         Georgia
 FIPS                           13133
Cases_Per100000                 492.9
Cases_Per100000_LCI             452.6
Cases_Per100000_UCI             536.3
Average Annual Count              136
Recent Trend                        *
Trend_in_5_year                     *
Trend_LCI                           *
Trend_UCI                           *
Counties                Greene County
SEER                                7
NPCR                                 
Name: 584, dtype: object
State                             Utah
 FIPS                            49009
Cases_Per100000                  464.8
Cases_Per100000_LCI              326.1
Cases_Per100000_UCI              652.1
Average Annual Count                 8
Recent Trend                         *
Trend_in_5_year                      *
Trend_LCI                            *
Trend_UCI                            *
Counties                Daggett County
SEER                                 7
NPCR                                  
Name: 1193, dtype: object
State                         Georgia
 FIPS                           13301
Cases_Per100000                 457.1
Cases_Per100000_LCI             388.9
Cases_Per100000_UCI               535
Average Annual Count               36
Recent Trend                        *
Trend_in_5_year                     *
Trend_LCI                           *
Trend_UCI                           *
Counties                Warren County
SEER                                7
NPCR                                 
Name: 1407, dtype: object
State                            Georgia
 FIPS                              13163
Cases_Per100000                    447.9
Cases_Per100000_LCI                406.7
Cases_Per100000_UCI                492.4
Average Annual Count                  92
Recent Trend                           *
Trend_in_5_year                        *
Trend_LCI                              *
Trend_UCI                              *
Counties                Jefferson County
SEER                                   7
NPCR                                    
Name: 1623, dtype: object
State                           Georgia
 FIPS                             13125
Cases_Per100000                   446.9
Cases_Per100000_LCI               353.1
Cases_Per100000_UCI               559.3
Average Annual Count                 17
Recent Trend                          *
Trend_in_5_year                       *
Trend_LCI                             *
Trend_UCI                             *
Counties                Glascock County
SEER                                  7
NPCR                                   
Name: 1646, dtype: object
State                          Georgia
 FIPS                            13141
Cases_Per100000                  392.3
Cases_Per100000_LCI              343.5
Cases_Per100000_UCI              446.9
Average Annual Count                51
Recent Trend                         *
Trend_in_5_year                      *
Trend_LCI                            *
Trend_UCI                            *
Counties                Hancock County
SEER                                 7
NPCR                                  
Name: 2487, dtype: object
State                             Georgia
 FIPS                               13265
Cases_Per100000                     361.5
Cases_Per100000_LCI                 265.9
Cases_Per100000_UCI                 487.9
Average Annual Count                   11
Recent Trend                            *
Trend_in_5_year                         *
Trend_LCI                               *
Trend_UCI                               *
Counties                Taliaferro County
SEER                                    7
NPCR                                     
Name: 2718, dtype: object
State                             Texas
 FIPS                             48311
Cases_Per100000                   305.7
Cases_Per100000_LCI               178.9
Cases_Per100000_UCI               503.5
Average Annual Count                  4
Recent Trend                          *
Trend_in_5_year                       *
Trend_LCI                             *
Trend_UCI                             *
Counties                McMullen County
SEER                                  6
NPCR                                 10
Name: 2877, dtype: object
In [12]:
# Because Director Ami does not allow us to use other data
# so I decide to drop these rows with '*'
# let me introduce the meaning of this '*' firstly

# '*' means: 
# Data has been suppressed to ensure confidentiality and stability of rate and trend estimates.

# although I will not fill this row, I still find the report from Georgia
# the report's website is attached
# https://statecancerprofiles.cancer.gov/ratetrendbycancer/index.php?cancer=001&sex=1&stateFIPS=13&comparison=00&type=rtcancer

Cancer_rate = Cancer_rate[Cancer_rate['Recent Trend'] != '*'].reset_index(drop=True)
Cancer_rate.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2920 entries, 0 to 2919
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   State                 2920 non-null   object
 1    FIPS                 2920 non-null   object
 2   Cases_Per100000       2920 non-null   object
 3   Cases_Per100000_LCI   2920 non-null   object
 4   Cases_Per100000_UCI   2920 non-null   object
 5   Average Annual Count  2920 non-null   object
 6   Recent Trend          2920 non-null   object
 7   Trend_in_5_year       2920 non-null   object
 8   Trend_LCI             2920 non-null   object
 9   Trend_UCI             2920 non-null   object
 10  Counties              2920 non-null   object
 11  SEER                  2920 non-null   object
 12  NPCR                  2920 non-null   object
dtypes: object(13)
memory usage: 296.7+ KB
In [13]:
# And then we can change column dtypes

# to begin with, we need to check whether all value is numeric in our selected columns that will be transfered to 'float'
categorical_column = ['State','Recent Trend','Counties']

for i in Cancer_rate.columns:
    if i in categorical_column:
        Cancer_rate[i] = Cancer_rate[i].astype('category')
    else:
        Cancer_rate[i] = Cancer_rate[i].astype('float')
        
print(Cancer_rate.dtypes)
State                   category
 FIPS                    float64
Cases_Per100000          float64
Cases_Per100000_LCI      float64
Cases_Per100000_UCI      float64
Average Annual Count     float64
Recent Trend            category
Trend_in_5_year          float64
Trend_LCI                float64
Trend_UCI                float64
Counties                category
SEER                     float64
NPCR                     float64
dtype: object
In [14]:
cases100000_mean = pd.DataFrame(Cancer_rate.groupby('State')['Cases_Per100000'].mean()).reset_index(level=0)

mean_cases_per100000 = cases100000_mean['Cases_Per100000'].mean()

fig = plt.figure(figsize=(20,20))
sns.set(font_scale=1.3)
sns.set_style("whitegrid")
p = sns.barplot(y='State',x='Cases_Per100000',data=cases100000_mean,palette="Set3")
p.axvline(mean_cases_per100000, ls='-',color='cyan',lw=5)

bbox_props = dict(boxstyle="larrow,pad=0.5", fc="cyan", ec="b", lw=3)
t = p.text(550, 24, "Line represented the mean value in the US", ha="center", va="center",
            size=15,
            bbox=bbox_props)

plt.title('Mean Cancer Cases Per 100000 people in each state')

fig.autofmt_xdate()

# we can find half states are cross the mean line
In [18]:
count = Cancer_rate.groupby('State').count().sort_values(by='Cases_Per100000',ascending=False).iloc[:,1]

output_notebook()

source = ColumnDataSource(data=dict(states=count.index.tolist(), counts=count.values))
palette = list(Category20.values())[16] + list(Category20.values())[15]
p = figure(x_range=count.index.tolist(), plot_height=300, title="Counts of each states")
p.vbar(x='states', top='counts', width=0.9, source=source,
       line_color='white', fill_color=factor_cmap('states', palette=palette, factors=count.index.tolist()))
p.xaxis.major_label_orientation = math.pi/2
show(p)
Loading BokehJS ...
//anaconda3/lib/python3.7/site-packages/bokeh/models/mappers.py:147: UserWarning:

Palette length does not match number of factors. ['New Jersey', 'Nevada', 'Maine', 'Arizona', 'Massachusetts', 'Vermont', 'New Hampshire', 'Connecticut', 'Rhode Island', 'Hawaii', 'Delaware', 'Puerto Rico', 'District of Columbia', 'US'] will be assigned to `nan_color` gray

In [15]:
# In order to see name clearly
# I abbreviate the state name

plt.figure(figsize=(40,40))
copy_cancer = Cancer_rate.copy()
abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

copy_cancer['State'] = copy_cancer['State'].map(abbrev)

# just select 30 states
sample_list = random.sample(copy_cancer['State'].tolist(),30)

copy_cancer.groupby('State')
b = dxp.aggplot(agg='State',data=copy_cancer[copy_cancer['State'].isin(sample_list)],hue='Recent Trend',normalize='State',orient='h',figsize=(40,40))
b.tick_params(labelsize=30)

b.legend(loc=4,prop={'size':50})

fig.autofmt_xdate()
<Figure size 2880x2880 with 0 Axes>
In [16]:
# Then I will draw two different types of plot
# 1. count of falling is larger than count of stable
# 2. count of rising is larger than count of stable

trend_count = copy_cancer.groupby(['Recent Trend','State'])['Recent Trend'].count().unstack().transpose().rename(columns=str).reset_index()

trend_count['falling - stable'] = trend_count['falling'] - trend_count['stable']

trend_count['rising_or_not'] = [i if i > 0  else 0 for i in trend_count['rising']]

def color_negative_red(value):
  """
  Colors elements in a dateframe
  green if positive and red if
  negative. Does not color NaN
  values.
  """

  if value < 0:
    color = 'red'
  elif value > 0:
    color = 'green'
  else:
    color = 'yellow'

  return 'background-color: %s' % color

trend_count.style.applymap(color_negative_red, subset=['falling - stable','rising_or_not'])
Out[16]:
Recent Trend State falling rising stable falling - stable rising_or_not
0 AK 1 0 24 -23 0
1 AL 1 3 63 -62 3
2 AR 1 3 71 -70 3
3 AZ 1 0 14 -13 0
4 CA 39 0 19 20 0
5 CO 5 0 58 -53 0
6 CT 5 0 3 2 0
7 DC 1 0 0 1 0
8 DE 0 0 3 -3 0
9 FL 8 1 58 -50 1
10 GA 21 8 120 -99 8
11 HI 1 0 3 -2 0
12 IA 6 7 86 -80 7
13 ID 13 0 30 -17 0
14 IL 3 4 95 -92 4
15 IN 4 3 85 -81 3
16 KY 11 7 102 -91 7
17 LA 13 2 49 -36 2
18 MA 14 0 0 14 0
19 MD 2 0 22 -20 0
20 ME 1 0 15 -14 0
21 MI 4 0 79 -75 0
22 MO 3 3 109 -106 3
23 MS 3 2 77 -74 2
24 MT 2 1 53 -51 1
25 NC 3 3 94 -91 3
26 ND 2 0 49 -47 0
27 NE 0 4 86 -86 4
28 NH 0 0 10 -10 0
29 NJ 15 0 6 9 0
30 NM 16 1 16 0 1
31 NV 1 0 16 -15 0
32 NY 21 0 41 -20 0
33 OH 1 3 84 -83 3
34 OK 4 1 72 -68 1
35 OR 2 0 34 -32 0
36 PA 3 0 64 -61 0
37 PR 0 0 1 -1 0
38 RI 1 0 4 -3 0
39 SC 6 0 40 -34 0
40 SD 2 0 63 -61 0
41 TN 2 1 92 -90 1
42 TX 20 6 220 -200 6
43 UT 4 0 24 -20 0
44 VA 4 4 125 -121 4
45 VT 1 0 13 -12 0
46 WA 11 0 28 -17 0
47 WI 1 2 69 -68 2
48 WV 0 2 53 -53 2
49 WY 1 1 21 -20 1
In [19]:
# For the EDA part, I will pick 4 states
# two of them should have the most cases per 10000 people
# and two of them should have the least cases per 10000 people
# and all of them should at least 40 rows data in order to make generally large sample size
state_larger_than_40 = count[count>40].index.tolist()
subset_larger_than_40 = Cancer_rate.query("State in @state_larger_than_40").reset_index(drop=True)

subset_larger_than_40.groupby('State')['Cases_Per100000'].mean().nlargest(3)
Out[19]:
State
Kentucky    526.522500
New York    501.574194
Illinois    486.338235
Name: Cases_Per100000, dtype: float64
In [20]:
groupby_state = subset_larger_than_40.groupby('State')['Cases_Per100000'].mean().nsmallest(3)
groupby_state
Out[20]:
State
Colorado      368.446032
Texas         400.554472
California    411.403448
Name: Cases_Per100000, dtype: float64
In [21]:
# Then I decide to do EDA for Texas, Georgia, Idaho and South Carolina

EDA_subset = Cancer_rate[Cancer_rate['State'].isin(['Illinois','California','Texas','New York','Colorado','Kentucky'])].reset_index(drop=True)
EDA_subset
Out[21]:
State FIPS Cases_Per100000 Cases_Per100000_LCI Cases_Per100000_UCI Average Annual Count Recent Trend Trend_in_5_year Trend_LCI Trend_UCI Counties SEER NPCR
0 Kentucky 21197.0 646.2 586.6 710.3 92.0 stable 1.1 -0.4 2.6 Powell County 7.0 8.0
1 Kentucky 21217.0 623.3 583.7 665.0 196.0 rising 1.4 0.7 2.2 Taylor County 7.0 8.0
2 Kentucky 21201.0 611.6 486.8 761.3 19.0 stable 2.2 -1.0 5.6 Robertson County 7.0 8.0
3 Kentucky 21181.0 603.0 532.0 681.5 55.0 stable 1.4 -0.5 3.3 Nicholas County 7.0 8.0
4 Kentucky 21065.0 597.4 547.0 651.6 110.0 stable -7.8 -19.5 5.6 Estill County 7.0 8.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
646 Texas 48271.0 256.9 197.2 331.1 14.0 stable 14.4 -31.8 91.6 Kinney County 6.0 10.0
647 Texas 48229.0 251.9 184.6 336.3 10.0 stable 0.1 -35.5 55.2 Hudspeth County 6.0 10.0
648 Colorado 8117.0 241.0 212.1 272.7 70.0 stable -6.8 -14.9 2.1 Summit County 6.0 10.0
649 Texas 48243.0 237.1 173.2 330.1 11.0 stable -2.1 -41.5 63.6 Jeff Davis County 6.0 10.0
650 Texas 48109.0 218.1 149.1 310.8 7.0 stable 17.1 -31.6 100.7 Culberson County 6.0 10.0

651 rows × 13 columns

In [22]:
sns.set_style('white')

categories = np.unique(EDA_subset['State'])
colors = [plt.cm.tab10(i/float(len(categories)-1)) for i in range(len(categories))]

fig = plt.figure(figsize=(16,10))

for i,category in enumerate(categories):
    plt.scatter('Trend_in_5_year','Cases_Per100000',data=EDA_subset.loc[EDA_subset.State==category,:],
                cmap=colors[i],s=60,
                label=str(category),edgecolors='black',linewidths=0.5)
    
def encircle(x,y, ax=None, **kw):

    if not ax: 
        
        ax=plt.gca()

    p = np.c_[x,y]

    hull = ConvexHull(p)

    poly = plt.Polygon(p[hull.vertices,:], **kw)

    ax.add_patch(poly)
    
encircle_data = EDA_subset.loc[EDA_subset.State.isin(['Illinois','Kentucky','New York']),:]
encircle_data = encircle_data.drop(encircle_data['Trend_in_5_year'].idxmax())
encircle_data = encircle_data.drop(encircle_data['Trend_in_5_year'].idxmin())


x = encircle_data.Trend_in_5_year

y = encircle_data.Cases_Per100000

encircle(x,y, ec="k", fc="gold", alpha=0.1)

encircle(x,y, ec="firebrick", fc="none", linewidth=1.5)

plt.gca().set(xlabel='Trend in 5 years', ylabel='Cases Per 100000 people')

plt.xticks(fontsize=12)

plt.yticks(fontsize=12)

plt.title("Distribution of Cases Per 100000 and Trend in 5 years", fontsize=22)

plt.legend(fontsize=15)    

plt.show()   

# we can find the data for three states with larger mean value of cases_per10000 are included in small polygon
In [23]:
# we also can draw the distribution plot for these states

plt.figure(figsize=(20,16),dpi=40)

states = ['Illinois','New York','Kentucky','California','Texas','Colorado']
colors = ['darkorange','darkgoldenrod','olive','darkslateblue','rebeccapurple','palevioletred']

for color,state in zip(colors,states):
    sns.set(font_scale=2)
    sns.kdeplot(EDA_subset.loc[EDA_subset['State'] == state,'Cases_Per100000'],shade=True,color=color,label=state,alpha=0.5)
    
leg = plt.legend()
for line in leg.get_lines():
    line.set_linewidth(10)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.title('Density Plot for Cases Per 100000',fontsize=20)

# we can find each states followed normal distribution
# and we can split them by this black line
Out[23]:
Text(0.5, 1.0, 'Density Plot for Cases Per 100000')
In [24]:
# draw the boxplot with lower bound and upper bound for Cancer Cases

def plotly_errorbars(state):
    dataset = EDA_subset[EDA_subset['State'] == state].sample(n=20)

    dataset['Cases_UCI'] = dataset['Cases_Per100000_UCI'] - dataset['Cases_Per100000']
    dataset['Cases_LCI'] = dataset['Cases_Per100000'] - dataset['Cases_Per100000_LCI']
    dataset['Trend_UCI_diff'] = dataset['Trend_UCI'] - dataset['Trend_in_5_year']
    dataset['Trend_LCI_diff'] = dataset['Trend_in_5_year'] - dataset['Trend_LCI']
    fig1 = px.scatter(dataset, x="Counties", y="Cases_Per100000",color='Trend_in_5_year',
                 error_y="Cases_LCI", error_y_minus="Cases_UCI")
    fig1.data[0].update(mode='markers',marker=dict(size=10))
    
    trace = fig1['data'][0]
    
    return trace,dataset

fig = make_subplots(
    rows=2, cols=2, subplot_titles=("Kentucky", "New York", "Texas", "California")
)

    

trace1,dataset1 = plotly_errorbars('Kentucky')
Kentucky = dataset1


trace2,dataset2 = plotly_errorbars('New York')
New_York = dataset2


trace3,dataset3 = plotly_errorbars('Texas')
Texas = dataset3


trace4,dataset4 = plotly_errorbars('California')
California = dataset4


fig.update_layout(
    margin=dict(l=30, r=30, t=30, b=60),
    paper_bgcolor="LightSteelBlue",
    width = 800,
    height = 1200
    )

fig.add_trace(trace1,row=1,col=1)
fig.add_trace(go.Scatter(
    x = Kentucky['Counties'],
    y = Kentucky['Cases_Per100000']),row=1,col=1)
fig.add_trace(trace2,row=1,col=2)
fig.add_trace(go.Scatter(
    x = New_York['Counties'],
    y = New_York['Cases_Per100000']),row=1,col=2)
fig.add_trace(trace3,row=2,col=1)
fig.add_trace(go.Scatter(
    x = Texas['Counties'],
    y = Texas['Cases_Per100000']),row=2,col=1)
fig.add_trace(trace4,row=2,col=2)
fig.add_trace(go.Scatter(
    x = California['Counties'],
    y = California['Cases_Per100000']),row=2,col=2)

fig.layout.update(showlegend=False) 

fig.show()
In [158]:
# draw the boxplot with lower bound and upper bound for Recent Trend

fig,ax = plt.subplots(2,2,figsize=(50,50))
states = [New_York,California,Kentucky,Texas]
state_names = ['New_York','California','Kentucky','Texas']
for i,dataset in enumerate(states[0:2]):
    plt.subplot(2,2,i+1)
    norm = plt.Normalize(dataset['Trend_in_5_year'].min(), dataset['Trend_in_5_year'].max())
    cmap = plt.get_cmap("BrBG")
    plt.bar(x="Counties", height="Trend_in_5_year", yerr=[dataset['Trend_LCI_diff'],dataset['Trend_UCI_diff']],data=dataset, color=cmap(norm(dataset['Trend_in_5_year'].values)))

    sm = cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])

    cbar = plt.colorbar(sm)
    plt.xticks(rotation=90)
    plt.title(state_names[i],fontsize=40)
    
for i,dataset in enumerate(states[2:4]):
    plt.subplot(2,2,i+3)
    norm = plt.Normalize(dataset['Trend_in_5_year'].min(), dataset['Trend_in_5_year'].max())
    cmap = plt.get_cmap("PiYG")
    plt.bar(x="Counties", height="Trend_in_5_year", yerr=[dataset['Trend_LCI_diff'],dataset['Trend_UCI_diff']],data=dataset, color=cmap(norm(dataset['Trend_in_5_year'].values)))

    sm = cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])

    cbar = plt.colorbar(sm)
    plt.xticks(rotation=90)
    plt.title(state_names[i+2],fontsize=40)
In [25]:
Cancer_rate.to_csv('Cancer_rate_cleaned.csv',index=False)